This workbook presents the first part of a basic work flow for scRNAseq analysis 1. Loading data 2. Quality Control 2.1 Cell filtering 2.2 Doublet assessment 3. Normalization 4. Feature selection 5. Dimensionality reduction
Example data set description:
Load your required libraries. You need to have these libraries installed.
library(Seurat)
library(tidyverse)
library(DoubletFinder)
Load the data: Starting with the adolescent data.
# load data
# insert the pathway to the location of the data
# Path to the 10X raw feature-barcode matrix for the adolescent sample
data_path <- "/Users/rhalenathomas/Documents/scRNAclubMcGill/workshop2023/data/Adolescent_14_YO_raw_feature_bc_matrix"
# Read the raw matrix; strip.suffix removes the trailing "-1" from cell barcodes
adolescent_data <- Read10X(
  data.dir = data_path,
  strip.suffix = TRUE
)
|--------------------------------------------------|
|==================================================|
#Look at the dimensions of the matrix
dim(adolescent_data)
[1] 33538 6794880
#Look at a small part of the data
adolescent_data[1:5, 1:5]
5 x 5 sparse Matrix of class "dgCMatrix"
AAACCCAAGAAACACT AAACCCAAGAAACCAT AAACCCAAGAAACCCA AAACCCAAGAAACCCG
MIR1302-2HG . . . .
FAM138A . . . .
OR4F5 . . . .
AL627309.1 . . . .
AL627309.3 . . . .
AAACCCAAGAAACCTG
MIR1302-2HG .
FAM138A .
OR4F5 .
AL627309.1 .
AL627309.3 .
#Look at the distribution of the number of UMIs per cell
colSums(adolescent_data) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 4.177 0.000 28647.000
#Look at the distribution of the number of genes per cell
colSums(adolescent_data > 0) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 2.074 0.000 6315.000
Remove barcodes with too few genes that could be empty droplets
#Keep only barcodes with more than 200 detected genes (you can select a different value here)
#You could use something more refined here, like EmptyDrops
genes_per_barcode <- colSums(adolescent_data > 0)
adolescent_data <- adolescent_data[, genes_per_barcode > 200]
dim(adolescent_data)
[1] 33538 7371
Now we have gone from 6794880 barcodes to 7371 barcodes. These barcodes now represent cells.
Filter genes and create a Seurat object
#We might not want to include genes that occur in few cells or in no cells. Here we will filter out genes/transcripts that are in less than 3 cells.
#Make a Seurat object
#Removing any genes detected in less than 3 cells
# (min.cells = 3 drops genes seen in fewer than 3 cells; they carry little signal)
adolescent_data_seurat <- CreateSeuratObject(adolescent_data, project = "Adolescent", min.cells = 3)
# look at the object dimensions
# printing the object reports features x cells and the active assay
adolescent_data_seurat
An object of class Seurat
19866 features across 7371 samples within 1 assay
Active assay: RNA (19866 features, 0 variable features)
We have now gone from 33538 RNA transcripts/genes to 19866 genes.
Now we will look at some metadata in the seurat object
#Look at some metadata
adolescent_data_seurat@meta.data %>% names
[1] "orig.ident" "nCount_RNA" "nFeature_RNA"
# these are the metadata columns we currently have in our seurat object
Data distribution
# look at the distribution of total counts of RNA across cells
adolescent_data_seurat$nCount_RNA %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
228 1866 3080 3459 4486 28645
# look at the distribution of unique RNA transcripts across cells
adolescent_data_seurat$nFeature_RNA %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
201 999 1361 1558 2132 6315
Visualize the distributions
VlnPlot(adolescent_data_seurat, features = c("nCount_RNA","nFeature_RNA"), pt.size = 0)
Repeat all these steps with the adult data.
#Repeat above steps for the adult dataset, use a minimum genes per cell barcode cutoff of 200
#With real data you would use the same cutoff across samples
#You would decide the cutoff based on what works best across your samples.
#Can be left as an exercise i.e. no code provided until after the workshop or code provided in a different file
data_path2 <- "/Users/rhalenathomas/Documents/scRNAclubMcGill/workshop2023/data/Adult_41_YO_raw_feature_bc_matrix"
adult_data <- Read10X(data.dir = data_path2,
strip.suffix = TRUE)
dim(adult_data)
[1] 33538 6794880
adult_data[1:5, 1:5]
5 x 5 sparse Matrix of class "dgCMatrix"
AAACCCAAGAAACACT AAACCCAAGAAACCAT AAACCCAAGAAACCCA AAACCCAAGAAACCCG
MIR1302-2HG . . . .
FAM138A . . . .
OR4F5 . . . .
AL627309.1 . . . .
AL627309.3 . . . .
AAACCCAAGAAACCTG
MIR1302-2HG .
FAM138A .
OR4F5 .
AL627309.1 .
AL627309.3 .
colSums(adult_data) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 1.538 0.000 22673.000
colSums(adult_data > 0) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 0.798 0.000 5844.000
adult_data <- adult_data[, colSums(adult_data > 0)> 200]
dim(adult_data)
[1] 33538 2201
adult_data_seurat <- CreateSeuratObject(adult_data, project = "Adult", min.cells = 3)
adult_data_seurat
An object of class Seurat
17269 features across 2201 samples within 1 assay
Active assay: RNA (17269 features, 0 variable features)
adult_data_seurat@meta.data %>% names
[1] "orig.ident" "nCount_RNA" "nFeature_RNA"
adult_data_seurat$nCount_RNA %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
226 1213 3235 3711 5444 22652
Now we want to add some sample meta data
#Make a tibble with the age info
colnames(adolescent_data_seurat) %>% head
[1] "AAACCCAAGATCGCTT" "AAACCCAAGGTTCAGG" "AAACCCACATGATCTG" "AAACCCAGTAACTAAG"
[5] "AAACCCAGTCAACACT" "AAACCCAGTCCAATCA"
# One row per cell barcode; every cell in this sample comes from a 14-year-old donor
adolescent_sample_metadata <- tibble(
  Cell_barcodes = colnames(adolescent_data_seurat),
  age = rep(14, ncol(adolescent_data_seurat))
)
head(adolescent_sample_metadata)
# this sample is from one brain and the age is 14 years so we will add that data for all the cells
The cells have previously been annotated - we will add in those annotations
# input file path
input_path <- "/Users/rhalenathomas/Documents/scRNAclubMcGill/workshop2023/drive-download-20230406T132728Z-001/"
# paste0 is the idiomatic way to concatenate without a separator
input_file <- paste0(input_path, "HA799_14YO_metadata.csv")
#Read in metadata file with original cluster info
adolescent_sample_clusters <- read_csv(file = input_file)
New names:
• `` -> `...1`
Rows: 4453 Columns: 8
── Column specification ──────────────────────────────────────────────────────────
Delimiter: ","
chr (3): ...1, orig.ident, Cell_type
dbl (5): nCount_RNA, nFeature_RNA, percent.mt, RNA_snn_res.0.6, seurat_clusters
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(adolescent_sample_clusters)
NA
NA
NA
NA
Select the columns we need
# Drop the QC columns (2 through 7), keeping only the barcode column and the
# original cluster/cell-type column, then give them descriptive names
adolescent_sample_clusters <- adolescent_sample_clusters[, -(2:7)]
names(adolescent_sample_clusters) <- c("Cell_barcodes", "Original_clusterID")
head(adolescent_sample_clusters)
NA
NA
Now we will combine the two metadata objects and add the metadata into the Seurat object
#Combine the age and original cluster metadata into a single object
#Note that not all the cells we have retained were present in the original analysis, we will keep all our called cells
#Join the original-cluster annotations onto the per-cell metadata.
#left_join keeps every called cell; cells absent from the original analysis get NA.
#Using `<-` with the target first: trailing `->` assignment at the end of a
#long pipe is easy to miss and is discouraged by the tidyverse style guide.
adolescent_sample_metadata <- adolescent_sample_metadata %>%
  left_join(adolescent_sample_clusters, by = "Cell_barcodes") %>%
  column_to_rownames("Cell_barcodes") %>%
  as.data.frame
adolescent_sample_metadata %>% head
#Here we add the metadata to our Seurat object
adolescent_data_seurat <- AddMetaData(adolescent_data_seurat, metadata = adolescent_sample_metadata)
adolescent_data_seurat@meta.data %>% head
NA
NA
#Remove the original matrix, and other unnecessary objects to clean up space
rm(adolescent_data, adolescent_sample_metadata, adolescent_sample_clusters)
Repeat adding meta data for the adult object
#Repeat the process of adding metadata for the adult sample
# Paths to the adult sample's original-analysis metadata
input_path <- "/Users/rhalenathomas/Documents/scRNAclubMcGill/workshop2023/drive-download-20230406T132728Z-001/"
# paste0 is the idiomatic way to concatenate without a separator
input_file <- paste0(input_path, "HA801_41YO_metadata.csv")
# One row per called cell; this donor is 41 years old
adult_sample_metadata <- tibble(CellName = colnames(adult_data_seurat),
                                age = rep(41, dim(adult_data_seurat)[2]))
# Read the original cluster annotations and drop the QC columns (2 through 7)
adult_sample_clusters <- read_csv(file = input_file) %>% select(-c(2:7))
New names:
• `` -> `...1`
• `` -> `...8`
Rows: 899 Columns: 8
── Column specification ──────────────────────────────────────────────────────────
Delimiter: ","
chr (3): ...1, orig.ident, ...8
dbl (5): nCount_RNA, nFeature_RNA, percent.mt, RNA_snn_res.0.6, seurat_clusters
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Give the annotation columns names matching the metadata tibble
colnames(adult_sample_clusters) <- c("CellName", "OriginalCluster")
#Use a left join, as for the adolescent sample, so we keep exactly the called
#cells; full_join would add rows for barcodes we filtered out, which AddMetaData
#ignores anyway (it matches on rownames), so the final object is unchanged.
adult_sample_metadata <- adult_sample_metadata %>%
  left_join(adult_sample_clusters, by = "CellName") %>%
  column_to_rownames("CellName") %>%
  as.data.frame
#Attach the combined metadata to the Seurat object (matched by cell rownames)
adult_data_seurat <- AddMetaData(adult_data_seurat, metadata = adult_sample_metadata)
adult_data_seurat@meta.data %>% head
# clean up extra files
rm(adult_data, adult_sample_clusters, adult_sample_metadata)
Filter out unwanted cells
# Example cell filtering based on mitochondrial count percentage and number of UMIs ----------
#Calculate the percentage of counts coming from mitochondrially encoded genes
# Per-cell percentage of counts from genes whose symbol starts with "MT-"
# (human mitochondrial gene naming convention); stored as metadata "percent.MT"
adolescent_data_seurat <- PercentageFeatureSet(adolescent_data_seurat, pattern = "^MT-", col.name = "percent.MT")
# summarize the distribution of mitochondrial percentage across cells
adolescent_data_seurat$percent.MT %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.118 6.381 7.654 9.558 63.859
Visualize the mitochondrial percentage distribution
VlnPlot(adolescent_data_seurat, features = "percent.MT", pt.size = 0.001)
#Remove any cells with more than 20% mitochondrial counts
# (a high mitochondrial fraction commonly indicates dying or stressed cells)
adolescent_data_seurat <- subset(adolescent_data_seurat, percent.MT < 20)
#Remove cells with very high UMI counts, which may be possible multiplets
adolescent_data_seurat <- subset(adolescent_data_seurat, nCount_RNA < 20000)
# see the results
VlnPlot(adolescent_data_seurat, features = c("percent.MT", "nCount_RNA", "nFeature_RNA"), pt.size = 0.001)
NA
NA
Repeat filtering for the adult sample
#Repeat filtering based on mitochondrial genes and number of UMIs for the adult sample
#Use the same criteria as we used for the adolescent sample
#Can be left as an exercise
adult_data_seurat <- PercentageFeatureSet(adult_data_seurat, pattern = "^MT-", col.name = "percent.MT")
adult_data_seurat$percent.MT %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.232 7.094 10.299 12.292 82.212
# Apply the same cutoff as the adolescent sample: drop cells with >= 20% mitochondrial counts
adult_data_seurat <- subset(adult_data_seurat, percent.MT < 20)
VlnPlot(adult_data_seurat, features = c("percent.MT", "nCount_RNA", "nFeature_RNA"))
# ...and remove cells with very high UMI counts, which may be possible multiplets
adult_data_seurat <- subset(adult_data_seurat, nCount_RNA < 20000)
VlnPlot(adult_data_seurat, features = c("percent.MT", "nCount_RNA", "nFeature_RNA"))
NA
NA
Cell cycle scoring (optional)
adolescent_data_seurat <- CellCycleScoring(adolescent_data_seurat, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)
Warning: The following features are not present in the object: CDCA7, DTL, MLF1IP, RAD51, CDC45, EXO1, E2F8, not searching for symbol synonyms
Warning: The following features are not present in the object: TOP2A, NDC80, MKI67, FAM64A, HN1, CDC25C, DLGAP5, not searching for symbol synonyms
VlnPlot(adolescent_data_seurat, features = c("S.Score", "G2M.Score"))
adult_data_seurat <- CellCycleScoring(adult_data_seurat, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)
Warning: The following features are not present in the object: DTL, MLF1IP, RAD51, RRM2, CDC45, CDC6, EXO1, E2F8, not searching for symbol synonyms
Warning: The following features are not present in the object: CDK1, TPX2, TOP2A, NDC80, MKI67, FAM64A, CKAP2L, AURKB, GTSE1, HJURP, HN1, TTK, CDC25C, KIF2C, DLGAP5, CDCA2, NEK2, not searching for symbol synonyms
VlnPlot(adult_data_seurat, features = c("S.Score", "G2M.Score"))
Data normalization
# Normalize data (log normalization) and select genes with variable expression across cells --------------------------------------
adolescent_data_seurat <- NormalizeData(adolescent_data_seurat, normalization.method = "LogNormalize", scale.factor = 10000)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
#Check out the effect of normalization
GetAssayData(adolescent_data_seurat, assay = "RNA", slot = "data") %>% expm1 %>% colSums %>% head
AAACCCAAGATCGCTT AAACCCAAGGTTCAGG AAACCCACATGATCTG AAACCCAGTAACTAAG
10000 10000 10000 10000
AAACCCAGTCAACACT AAACCCAGTCCAATCA
10000 10000
GetAssayData(adolescent_data_seurat, assay = "RNA", slot = "counts") %>% colSums %>% head
AAACCCAAGATCGCTT AAACCCAAGGTTCAGG AAACCCACATGATCTG AAACCCAGTAACTAAG
3212 4807 1945 7173
AAACCCAGTCAACACT AAACCCAGTCCAATCA
312 3350
Finding Variable feature with two different methods
# Dispersion
#Find and plot variable features (in our case genes) with dispersion based method
adolescent_data_seurat <- FindVariableFeatures(adolescent_data_seurat, selection.method = "disp", nfeatures = 2000)
Calculating gene means
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variance to mean ratios
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
VariableFeaturePlot(adolescent_data_seurat)
disp_var <- VariableFeatures(adolescent_data_seurat)
# VST
#Find and plot variable features (in our case genes) with vst based method
adolescent_data_seurat <- FindVariableFeatures(adolescent_data_seurat, selection.method = "vst", nfeatures = 2000)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
VariableFeaturePlot(adolescent_data_seurat)
Compare both
#Compare the two methods, look at some of the variable genes
intersect(disp_var, VariableFeatures(adolescent_data_seurat)) %>% length
[1] 745
VariableFeatures(adolescent_data_seurat) %>% head(n = 20)
[1] "SPARCL1" "SLC1A2" "CLU" "CPE" "GPR17" "BCAN" "LYZ"
[8] "SCRG1" "HSPA6" "GJA1" "GPNMB" "S100A9" "ATP1B2" "GJB6"
[15] "S100A8" "WIF1" "CCL2" "ETNPPL" "TUBB2B" "AREG"
Repeat for the adult sample
#Repeat normalization and variable feature selection for adult sample
#Use method vst and 2000 variable features
#Can be left as an exercise
adult_data_seurat <- NormalizeData(adult_data_seurat, normalization.method = "LogNormalize", scale.factor = 10000)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
adult_data_seurat <- FindVariableFeatures(adult_data_seurat, selection.method = "vst", nfeatures = 2000)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
VariableFeaturePlot(adult_data_seurat)
VariableFeatures(adult_data_seurat) %>% head(n = 20)
[1] "HSPA6" "GPNMB" "VIM" "CXCL8" "TUBB2B" "CCL5" "CXCL2"
[8] "IFIT3" "SLC1A2" "IRF1" "SERPINE1" "MT2A" "G0S2" "S100A8"
[15] "CCL4" "FAM43A" "S100A4" "IL1A" "SOX4" "VCAN"
Dimensionality reduction PCA and UMAP
#Scaling is recommended before PCA, as otherwise highly expressed genes will have a disproportionate effect
# By default ScaleData scales only the variable features (see the scale.data
# dimensions printed below); percent.MT is regressed out so mitochondrial
# content does not drive the principal components
adolescent_data_seurat <- ScaleData(adolescent_data_seurat, vars.to.regress = "percent.MT")
Regressing out percent.MT
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============= | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 23%
|
|================= | 24%
|
|================== | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|==================== | 27%
|
|==================== | 28%
|
|===================== | 28%
|
|===================== | 29%
|
|===================== | 30%
|
|====================== | 30%
|
|====================== | 31%
|
|======================= | 31%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 33%
|
|======================== | 34%
|
|========================= | 34%
|
|========================= | 35%
|
|========================== | 35%
|
|========================== | 36%
|
|========================== | 37%
|
|=========================== | 37%
|
|=========================== | 38%
|
|============================ | 38%
|
|============================ | 39%
|
|============================ | 40%
|
|============================= | 40%
|
|============================= | 41%
|
|============================== | 41%
|
|============================== | 42%
|
|=============================== | 42%
|
|=============================== | 43%
|
|=============================== | 44%
|
|================================ | 44%
|
|================================ | 45%
|
|================================= | 45%
|
|================================= | 46%
|
|================================== | 47%
|
|================================== | 48%
|
|=================================== | 48%
|
|=================================== | 49%
|
|==================================== | 49%
|
|==================================== | 50%
|
|==================================== | 51%
|
|===================================== | 51%
|
|===================================== | 52%
|
|====================================== | 52%
|
|====================================== | 53%
|
|======================================= | 54%
|
|======================================= | 55%
|
|======================================== | 55%
|
|======================================== | 56%
|
|========================================= | 56%
|
|========================================= | 57%
|
|========================================= | 58%
|
|========================================== | 58%
|
|========================================== | 59%
|
|=========================================== | 59%
|
|=========================================== | 60%
|
|============================================ | 60%
|
|============================================ | 61%
|
|============================================ | 62%
|
|============================================= | 62%
|
|============================================= | 63%
|
|============================================== | 63%
|
|============================================== | 64%
|
|============================================== | 65%
|
|=============================================== | 65%
|
|=============================================== | 66%
|
|================================================ | 66%
|
|================================================ | 67%
|
|================================================= | 67%
|
|================================================= | 68%
|
|================================================= | 69%
|
|================================================== | 69%
|
|================================================== | 70%
|
|=================================================== | 70%
|
|=================================================== | 71%
|
|=================================================== | 72%
|
|==================================================== | 72%
|
|==================================================== | 73%
|
|===================================================== | 73%
|
|===================================================== | 74%
|
|====================================================== | 74%
|
|====================================================== | 75%
|
|====================================================== | 76%
|
|======================================================= | 76%
|
|======================================================= | 77%
|
|======================================================== | 77%
|
|======================================================== | 78%
|
|========================================================= | 78%
|
|========================================================= | 79%
|
|========================================================= | 80%
|
|========================================================== | 80%
|
|========================================================== | 81%
|
|=========================================================== | 81%
|
|=========================================================== | 82%
|
|=========================================================== | 83%
|
|============================================================ | 83%
|
|============================================================ | 84%
|
|============================================================= | 84%
|
|============================================================= | 85%
|
|============================================================== | 85%
|
|============================================================== | 86%
|
|============================================================== | 87%
|
|=============================================================== | 87%
|
|=============================================================== | 88%
|
|================================================================ | 88%
|
|================================================================ | 89%
|
|================================================================ | 90%
|
|================================================================= | 90%
|
|================================================================= | 91%
|
|================================================================== | 91%
|
|================================================================== | 92%
|
|=================================================================== | 92%
|
|=================================================================== | 93%
|
|=================================================================== | 94%
|
|==================================================================== | 94%
|
|==================================================================== | 95%
|
|===================================================================== | 95%
|
|===================================================================== | 96%
|
|====================================================================== | 97%
|
|====================================================================== | 98%
|
|======================================================================= | 98%
|
|======================================================================= | 99%
|
|========================================================================| 99%
|
|========================================================================| 100%
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
adolescent_data_seurat@assays$RNA@scale.data %>% dim
[1] 2000 7101
#Linear dimensionality reduction
#Choosing the number of PCs can depend on how many cells you have
adolescent_data_seurat <- RunPCA(adolescent_data_seurat, assay = "RNA", npcs = 30)
PC_ 1
Positive: APLP1, PCDH9, EDIL3, TF, PPP1R14A, CNP, GPM6B, MAG, MARCKSL1, TUBB4A
ENPP2, AMER2, CLDN11, SCD, SLAIN1, PTGDS, ERMN, CNTN2, MOG, CD9
MAL, GPRC5B, HSPA2, TTLL7, GPR37, CRYAB, PLP1, PLLP, SEPT4, PRR18
Negative: CEBPD, CD74, CEBPB, SAT1, RGS1, SRGN, KLF6, APOE, CST3, KLF2
HLA-DRA, CH25H, DUSP1, CD83, HLA-DRB1, JUN, CCL3, HLA-DPA1, ID2, P2RY13
ZFP36L1, MIS18BP1, SPRY1, PDK4, EGR3, GPR183, CLEC7A, AC020916.1, SAMSN1, GRASP
PC_ 2
Positive: GPM6A, OLFM2, PTPRZ1, NRXN1, BCAN, SERPINE2, SPARCL1, SCG3, CPE, ATP1A2
NKAIN4, SLC1A2, CLDN10, TMEM100, HIF3A, FABP7, CHL1, ATP1B2, TUBB2B, TNR
NRCAM, FGFR3, EDNRB, C1orf61, ACSL6, TIMP3, SCRG1, LUZP2, ENHO, MT1M
Negative: SPP1, RNASE1, DUSP1, EVI2A, TMEM144, MAG, ERMN, ENPP2, CARNS1, ANLN
FOSB, NKX6-2, OPALIN, MOBP, HHIP, CLDN11, PTMA, RGS1, PIP4K2A, NECAB1
CNDP1, HSP90AA1, RAPGEF5, CD74, FAM107B, CNTN2, GPR37, SEPT4, SRGN, GAS7
PC_ 3
Positive: TNR, PDGFRA, MEG3, VCAN, C1QL1, COL9A1, LY6H, CA10, GPR17, PCDH15
FGF12, MEGF11, ATCAY, MMP16, RASD1, AFAP1L2, FXYD6, LUZP2, GPNMB, SOX6
ASCL1, PHLDA1, ASIC4, NXPH1, MYT1, B3GNT7, SEMA5A, THY1, KCNIP4, CSPG4
Negative: GJB6, GJA1, AQP4, ETNPPL, ID4, MGST1, SLCO1C1, MGAT4C, WIF1, F3
SDC4, SLC4A4, CXCL14, AGT, PPP1R1B, ADGRV1, GABRA2, FXYD1, SLC25A18, RORB
SLC39A12, FGFR3, RANBP3L, SDC2, PDGFRB, ATP1B2, ANGPTL4, IGFBP7, BMPR1B, EFEMP1
PC_ 4
Positive: HIST1H2BG, AL034397.3, PLCG2, AC243960.1, MIR503HG, HSPA1A, HSPA1B, AL031777.3, PLEKHH2, AC110285.2
HSPB1, CPB2-AS1, ACSS1, FUT9, AC004158.1, GRIA2, CST3, CCP110, ITM2A, GOLGA8B
PLP1, ADGRL3, THBS2, PPP1R9A, PLEKHA1, TMEM144, HIST1H2BH, MOSPD2, CADM2, KCNH8
Negative: EGR1, KDM6B, CD83, GRASP, EGR3, NR4A2, JUNB, FOSB, EGR2, ZFP36
NR4A1, BTG2, NR4A3, DUSP1, GADD45B, NFKBID, ID2, CCL3L1, MAP3K8, CCL3
TRIB1, SRGN, HSPA8, PLAUR, CDKN1A, ATF3, IER2, SYAP1, NEDD9, KLF4
PC_ 5
Positive: PLAUR, BCL2A1, NR4A3, CCL4L2, CCL4, TRIB1, ITGAX, CCL3L1, TNFAIP8L3, B3GNT5
NEDD9, NFKBID, MAP3K8, ABL2, EIF4E, DUSP4, PTGER4, CD69, GRASP, VMP1
CCL2, ZNF331, BASP1, IL1B, IL1A, GPR183, DUSP5, KDM6B, OSM, C5AR1
Negative: DNAJB1, IER2, HSPA1A, HSPA1B, JUN, EGR1, IRF1, FOSB, JUNB, BTG2
HSP90AA1, SOCS3, ID1, HSPH1, EFNA1, DUSP1, NFKBIZ, ZFP36, AC116366.1, TUBB4B
LDLR, DLL1, MYADM, HSPE1, TNFSF9, MAFF, IRF9, HSPD1, CFAP20, SRSF7
#PCAPlot(adolescent_data_seurat, group.by = "Original_clusterID")
# this is confusing because we won't normally have this
PCAPlot(adolescent_data_seurat)
#Assess how many PCs capture most of the information in the data
ElbowPlot(adolescent_data_seurat, ndims = 30)
NA
NA
# Jackstraw
#Assess how many PCs capture most of the information in the data
Non-linear dimensional reduction using UMAP
#Non-linear dimensionality reduction
#Choosing how many PCs to input can depend on the elbow plot and on the number of cells
#There are many parameters that can be tweaked and optimized in a UMAP plot
#You can see some demos here: https://pair-code.github.io/understanding-umap/
adolescent_data_seurat <- RunUMAP(adolescent_data_seurat, dims = 1:10)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
15:14:28 UMAP embedding parameters a = 0.9922 b = 1.112
15:14:28 Read 7101 rows and found 10 numeric columns
15:14:28 Using Annoy for neighbor search, n_neighbors = 30
15:14:28 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:14:29 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//RtmpglCqdx/file13ea17cf8c83
15:14:29 Searching Annoy index using 1 thread, search_k = 3000
15:14:30 Annoy recall = 100%
15:14:31 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:14:33 Initializing from normalized Laplacian + noise (using irlba)
15:14:33 Commencing optimization for 500 epochs, with 293486 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:14:44 Optimization finished
UMAPPlot(adolescent_data_seurat)
NA
NA
Repeat dimensional reduction with the adult sample
#Repeat dimensional reduction for the adult sample; use 10 PCs for the UMAP
adult_data_seurat <- ScaleData(adult_data_seurat, vars.to.regress = "percent.MT")
Regressing out percent.MT
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============= | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 23%
|
|================= | 24%
|
|================== | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|==================== | 27%
|
|==================== | 28%
|
|===================== | 28%
|
|===================== | 29%
|
|===================== | 30%
|
|====================== | 30%
|
|====================== | 31%
|
|======================= | 31%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 33%
|
|======================== | 34%
|
|========================= | 34%
|
|========================= | 35%
|
|========================== | 35%
|
|========================== | 36%
|
|========================== | 37%
|
|=========================== | 37%
|
|=========================== | 38%
|
|============================ | 38%
|
|============================ | 39%
|
|============================ | 40%
|
|============================= | 40%
|
|============================= | 41%
|
|============================== | 41%
|
|============================== | 42%
|
|=============================== | 42%
|
|=============================== | 43%
|
|=============================== | 44%
|
|================================ | 44%
|
|================================ | 45%
|
|================================= | 45%
|
|================================= | 46%
|
|================================== | 47%
|
|================================== | 48%
|
|=================================== | 48%
|
|=================================== | 49%
|
|==================================== | 49%
|
|==================================== | 50%
|
|==================================== | 51%
|
|===================================== | 51%
|
|===================================== | 52%
|
|====================================== | 52%
|
|====================================== | 53%
|
|======================================= | 54%
|
|======================================= | 55%
|
|======================================== | 55%
|
|======================================== | 56%
|
|========================================= | 56%
|
|========================================= | 57%
|
|========================================= | 58%
|
|========================================== | 58%
|
|========================================== | 59%
|
|=========================================== | 59%
|
|=========================================== | 60%
|
|============================================ | 60%
|
|============================================ | 61%
|
|============================================ | 62%
|
|============================================= | 62%
|
|============================================= | 63%
|
|============================================== | 63%
|
|============================================== | 64%
|
|============================================== | 65%
|
|=============================================== | 65%
|
|=============================================== | 66%
|
|================================================ | 66%
|
|================================================ | 67%
|
|================================================= | 67%
|
|================================================= | 68%
|
|================================================= | 69%
|
|================================================== | 69%
|
|================================================== | 70%
|
|=================================================== | 70%
|
|=================================================== | 71%
|
|=================================================== | 72%
|
|==================================================== | 72%
|
|==================================================== | 73%
|
|===================================================== | 73%
|
|===================================================== | 74%
|
|====================================================== | 74%
|
|====================================================== | 75%
|
|====================================================== | 76%
|
|======================================================= | 76%
|
|======================================================= | 77%
|
|======================================================== | 77%
|
|======================================================== | 78%
|
|========================================================= | 78%
|
|========================================================= | 79%
|
|========================================================= | 80%
|
|========================================================== | 80%
|
|========================================================== | 81%
|
|=========================================================== | 81%
|
|=========================================================== | 82%
|
|=========================================================== | 83%
|
|============================================================ | 83%
|
|============================================================ | 84%
|
|============================================================= | 84%
|
|============================================================= | 85%
|
|============================================================== | 85%
|
|============================================================== | 86%
|
|============================================================== | 87%
|
|=============================================================== | 87%
|
|=============================================================== | 88%
|
|================================================================ | 88%
|
|================================================================ | 89%
|
|================================================================ | 90%
|
|================================================================= | 90%
|
|================================================================= | 91%
|
|================================================================== | 91%
|
|================================================================== | 92%
|
|=================================================================== | 92%
|
|=================================================================== | 93%
|
|=================================================================== | 94%
|
|==================================================================== | 94%
|
|==================================================================== | 95%
|
|===================================================================== | 95%
|
|===================================================================== | 96%
|
|====================================================================== | 97%
|
|====================================================================== | 98%
|
|======================================================================= | 98%
|
|======================================================================= | 99%
|
|========================================================================| 99%
|
|========================================================================| 100%
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
# Run PCA on the scaled RNA assay, keeping the first 30 principal components
adult_data_seurat <- RunPCA(adult_data_seurat, assay = "RNA", npcs = 30)
PC_ 1
Positive: HLA-DRA, HLA-DPA1, SRGN, SAT1, CST3, RGS1, CEBPB, APOE, S100A11, KLF6
CCL3, DUSP1, OLR1, KLF2, CH25H, MS4A7, CTSS, ZFP36L1, ID2, TREM2
CCL3L1, HLA-DPB1, CD83, AKAP13, SLC11A1, IER3, P2RY13, REL, CCL4L2, NFKBIA
Negative: APLP1, CNP, MAG, TF, EDIL3, GPM6B, CNTN2, GPR37, PLLP, ERMN
APOD, PCDH9, MOG, ENPP2, TUBB4A, GPRC5B, PPP1R14A, CLDN11, TTLL7, PTGDS
STMN4, PMP22, MARCKSL1, AMER2, NKX6-2, SPOCK3, UGT8, FEZ1, CRYAB, PLP1
PC_ 2
Positive: SPARCL1, GPM6A, ATP1B2, CPE, TXNIP, SLC1A2, SERPINE2, AQP4, IL32, PLCG2
CD3E, ATP1A2, CD2, CLDN10, FXYD1, WIF1, CD3D, CD52, GJA1, MGST1
SCG3, DDAH1, TRAC, IGFBP7, TUBB2B, NKG7, F3, OLFM2, FGFR3, CD8A
Negative: KDM6B, GRASP, ZFP36, CD83, FOSB, JUNB, KLF4, FOS, PPP1R15A, KLF6
DUSP1, ID2, REL, NR4A1, NR4A2, MIDN, NLRP3, MYADM, PLAUR, GADD45B
SLC11A1, IER2, B3GNT5, KLF10, KLF2, TRIB1, C5AR1, CDKN1A, GPR183, MAP3K8
PC_ 3
Positive: IL32, CD3E, CCL5, CD2, CD52, S100A4, CD3D, CD8A, TRAC, NKG7
GZMA, ANXA1, KLRD1, CD48, CTSW, TRBC2, LINC01871, TRBC1, TRGC2, TUBA4A
GZMK, CD96, S100A10, CST7, VIM, RUNX3, S100A6, GZMM, ETS1, BCL11B
Negative: SPP1, HLA-DRA, CST3, APOE, HLA-DPA1, TREM2, SAT1, CX3CR1, P2RY13, CH25H
FCGR1A, PDK4, CD14, APOC1, VSIG4, DHRS9, USP53, CCL3, MRC1, MS4A7
OLR1, OTUD1, GSTM3, CKB, AC011139.1, CYTL1, LYVE1, CTSS, SCIN, S100A11
PC_ 4
Positive: GPM6A, SPARCL1, SLC1A2, ATP1B2, CPE, ATP1A2, PTPRZ1, AQP4, OLFM2, TUBB2B
SERPINE2, WIF1, FGFR3, GJA1, MGST1, CLDN10, F3, ALDOC, FXYD1, ID4
GJB6, SCG3, AGT, IGFBP7, ETNPPL, MLC1, SLC4A4, NRXN1, ENHO, BCAN
Negative: SPP1, IL32, CD3E, CD2, CD8A, CCL5, TXNIP, NKG7, GZMA, CD3D
TRAC, OPALIN, PIP4K2A, KLRD1, CD52, TPPP, MOBP, SH3GL3, TRGC2, TRBC1
ZNF536, LINC01871, FAM107B, CTSW, TTLL7, CLMN, ARHGAP21, GZMK, TMEM144, CNDP1
PC_ 5
Positive: PLAUR, VEGFA, GRASP, MAP3K8, BASP1, NR4A3, SPHK1, BCL2A1, C5AR1, B3GNT5
BHLHE40, NFKB1, LCP2, SLC7A5, ADGRL3, NEDD9, S100A9, PNP, NLRP3, DUSP4
S100A8, RILPL2, AC245128.3, RAB20, BCAT1, DUSP5, IL1A, STX11, ITGAX, MAP2K3
Negative: DNAJB1, EGR1, HSPA1A, JUN, HSPA1B, IER2, IRF1, FOS, HSPH1, BTG2
IRF9, HSP90AA1, JUNB, AC007952.4, EFNA1, SOCS3, MAFF, TNFSF9, FOSB, DNAJA1
HSPD1, NFKBIZ, AC116366.1, BAG3, AC245014.3, BAMBI, GADD45B, FAM43A, ATF3, C11orf96
# Visualize cells in PC1/PC2 space
PCAPlot(adult_data_seurat)
# Elbow plot of the standard deviation explained per PC; used to decide how many PCs to keep
ElbowPlot(adult_data_seurat)
# Compute the UMAP embedding from the first 10 PCs (selection informed by the elbow plot above)
adult_data_seurat <- RunUMAP(adult_data_seurat, dims = 1:10)
15:15:27 UMAP embedding parameters a = 0.9922 b = 1.112
15:15:27 Read 1943 rows and found 10 numeric columns
15:15:27 Using Annoy for neighbor search, n_neighbors = 30
15:15:27 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:15:27 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//RtmpglCqdx/file13ea14b11f076
15:15:27 Searching Annoy index using 1 thread, search_k = 3000
15:15:27 Annoy recall = 100%
15:15:29 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:15:30 Initializing from normalized Laplacian + noise (using irlba)
15:15:30 Commencing optimization for 500 epochs, with 75730 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:15:33 Optimization finished
# Plot the UMAP embedding of the adult sample
UMAPPlot(adult_data_seurat)
Doublet removal
# Assess possible doublets -----------------------------------------------
#Using instructions here: https://github.com/chris-mcginnis-ucsf/DoubletFinder
#First we have to find a pK which determines how big of a neighborhood will be examined for doublets
#This should be chosen for each library separately
#First we test a number of pN (proportion of generated artificial doublets) and pK
#We get different lists of probabilities of artificial nearest neighbors with these tested parameters
#Also keep in mind the results are not deterministic (every run will give slightly different results)
#sct = FALSE because this object was log-normalized (not SCTransformed); sweep uses the first 10 PCs
sweep.res.list_adolescent <- paramSweep_v3(adolescent_data_seurat, PCs = 1:10, sct = FALSE)
Loading required package: fields
Loading required package: spam
Spam version 2.9-1 (2022-08-07) is loaded.
Type 'help( Spam)' or 'demo( spam)' for a short introduction
and overview of this package.
Help for individual functions is also obtained by adding the
suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
Attaching package: ‘spam’
The following object is masked from ‘package:Matrix’:
det
The following objects are masked from ‘package:base’:
backsolve, forwardsolve
Loading required package: viridis
Loading required package: viridisLite
Try help(fields) to get started.
[1] "Creating artificial doublets for pN = 5%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 10%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 15%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 20%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 25%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 30%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
#We do not have the "ground truth" regarding doublets, such as from genotype data for pooled samples
#We summarize the performance of the range of pN-pK parameters we tested (GT = FALSE: no ground truth)
sweep.stats_adolescent <- summarizeSweep(sweep.res.list_adolescent, GT = FALSE)
Loading required package: KernSmooth
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
Loading required package: ROCR
#Here the "best" pK for the data is chosen based on a metric determined by the DoubletFinder developers
#Which performs best in datasets where the ground truth is known
#The returned table contains the BCmetric values plotted below to pick the peak pK
bcmvn_adolescent <- find.pK(sweep.stats_adolescent)
NULL
#Plot the BCmetric across the tested pK values; the peak suggests the pK to use
ggplot(bcmvn_adolescent, aes(x = pK, y = BCmetric, group = "Sweep")) + geom_point() + geom_line()
#We will pick pK = 0.29
#We are not going to use our clustering information to estimate "homotypic" doublets
#We are simply going to use an expected doublet formation rate of 7.5%
#nExp = expected number of doublets; rows of meta.data correspond to cells
nExp_poi <- round(0.075*nrow(adolescent_data_seurat@meta.data))
adolescent_data_seurat <- doubletFinder_v3(adolescent_data_seurat, PCs = 1:10, pN = 0.25, pK = 0.29, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
[1] "Creating 2367 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."
#Here we update the Seurat object version just in case the one returned by DoubletFinder is an older version
#(this is a no-op when the object is already in the current representation)
adolescent_data_seurat <- UpdateSeuratObject(adolescent_data_seurat)
Validating object structure
Updating object slots
Ensuring keys are in the proper structure
Ensuring feature names don't have underscores or pipes
Object representation is consistent with the most current Seurat version
#Visualize and assess the cells called as probable doublets
#DoubletFinder's metadata column name encodes the parameters used: pN_pK_nExp (here 0.25_0.29_533)
UMAPPlot(adolescent_data_seurat, group.by = "DF.classifications_0.25_0.29_533")
# table of doublets and singlets
adolescent_data_seurat$DF.classifications_0.25_0.29_533 %>% table
.
Doublet Singlet
533 6568
# visualize the features in doublets and singlets
# NOTE(review): doublets are typically expected to show higher nCount/nFeature and pANN — confirm in the plot
VlnPlot(adolescent_data_seurat, features = c("nCount_RNA", "nFeature_RNA", "percent.MT", "pANN_0.25_0.29_533"),
group.by = "DF.classifications_0.25_0.29_533", pt.size = 0.001)
Repeat doublet detection in adult sample
#Repeat the above analysis with the adult sample
#Sweep pN/pK combinations over the first 10 PCs to choose a pK for this library
sweep.res.list_adult <- paramSweep_v3(adult_data_seurat, PCs = 1:10, sct = FALSE)
[1] "Creating artificial doublets for pN = 5%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 10%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 15%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 20%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 25%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
[1] "Creating artificial doublets for pN = 30%"
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Defining neighborhoods..."
[1] "Computing pANN across all pK..."
[1] "pK = 0.005..."
[1] "pK = 0.01..."
[1] "pK = 0.02..."
[1] "pK = 0.03..."
[1] "pK = 0.04..."
[1] "pK = 0.05..."
[1] "pK = 0.06..."
[1] "pK = 0.07..."
[1] "pK = 0.08..."
[1] "pK = 0.09..."
[1] "pK = 0.1..."
[1] "pK = 0.11..."
[1] "pK = 0.12..."
[1] "pK = 0.13..."
[1] "pK = 0.14..."
[1] "pK = 0.15..."
[1] "pK = 0.16..."
[1] "pK = 0.17..."
[1] "pK = 0.18..."
[1] "pK = 0.19..."
[1] "pK = 0.2..."
[1] "pK = 0.21..."
[1] "pK = 0.22..."
[1] "pK = 0.23..."
[1] "pK = 0.24..."
[1] "pK = 0.25..."
[1] "pK = 0.26..."
[1] "pK = 0.27..."
[1] "pK = 0.28..."
[1] "pK = 0.29..."
[1] "pK = 0.3..."
#Summarize sweep performance for the adult library (GT = FALSE: no ground-truth doublet labels)
sweep.stats_adult <- summarizeSweep(sweep.res.list_adult, GT = FALSE)
#Compute the pK-selection metric (BCmetric), plotted below to pick the peak
bcmvn_adult <- find.pK(sweep.stats_adult)
NULL
#Plot the BCmetric across pK values; the peak suggests the pK to use for this library
ggplot(bcmvn_adult, aes(x = pK, y = BCmetric, group = "Sweep")) + geom_point() + geom_line()
#Same expected doublet formation rate (7.5%) as used for the adolescent sample
nExp_poi <- round(0.075*nrow(adult_data_seurat@meta.data))
#pK = 0.08 chosen for this library (each library gets its own pK)
adult_data_seurat <- doubletFinder_v3(adult_data_seurat, PCs = 1:10, pN = 0.25, pK = 0.08, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
[1] "Creating 648 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|==================================== | 50%
|
|========================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."
#Update the Seurat object version in case DoubletFinder returned an older representation
adult_data_seurat <- UpdateSeuratObject(adult_data_seurat)
Validating object structure
Updating object slots
Ensuring keys are in the proper structure
Ensuring feature names don't have underscores or pipes
Object representation is consistent with the most current Seurat version
#Visualize doublet calls on the UMAP; the column name encodes pN_pK_nExp (here 0.25_0.08_146)
UMAPPlot(adult_data_seurat, group.by = "DF.classifications_0.25_0.08_146")
#Count doublets vs singlets
adult_data_seurat$DF.classifications_0.25_0.08_146 %>% table
.
Doublet Singlet
146 1797
#Compare QC metrics and the pANN doublet score between called doublets and singlets
VlnPlot(adult_data_seurat, features = c("nCount_RNA", "nFeature_RNA", "percent.MT", "pANN_0.25_0.08_146"),
group.by = "DF.classifications_0.25_0.08_146", pt.size = 0.001)
Remove doublets
# adolescent
#TODO: subset to cells called "Singlet" in DF.classifications_0.25_0.29_533
# adult
#TODO: subset to cells called "Singlet" in DF.classifications_0.25_0.08_146
Save the data objects for later
# Output directory for the processed Seurat objects
output_path <- "/Users/rhalenathomas/Documents/scRNAclubMcGill/workshop2023/"
# Use paste0 (not paste): paste's default sep = " " inserts a space into the
# path, producing files literally named " adolsecentSeurate.RDS" in the folder.
# NOTE(review): the "adolsecentSeurate"/"adultSeurate" spellings look like typos,
# but are kept so any downstream code reading these filenames still works.
saveRDS(adolescent_data_seurat, paste0(output_path, "adolsecentSeurate.RDS"))
saveRDS(adult_data_seurat, paste0(output_path, "adultSeurate.RDS"))